Install, load libraries, and create directories

install.packages("kableExtra")
library(tidyverse)
library(plotly)
library(ggpubr)
library(kableExtra)

Import the gapminder dataset and graph CO2 emissions for 1962

#Import dataset
gapminder_clean <- read_csv(paste0(getwd(), "/gapminder_clean.csv"))

#Graph CO2 emissions vs gdp for 1962
plotly::ggplotly(
gapminder_clean %>%
  filter(Year==1962) %>% 
  ggplot(aes(`CO2 emissions (metric tons per capita)`, gdpPercap)) + 
  geom_point() + geom_smooth() + ggtitle("CO2 emission vs GDP") + theme(plot.title = element_text(hjust = 0.5))
)
# dev.off()

Correlate CO2 levels with the GDP

#Correlate CO2 vs gdpPerCap for 1962
gapminder_clean %>%
  filter(Year==1962) %>% 
  summarise(correlation = stats::cor.test(gapminder_clean$`CO2 emissions (metric tons per capita)`, gapminder_clean$gdpPercap)$estimate, pval = stats::cor.test(gapminder_clean$`CO2 emissions (metric tons per capita)`, gapminder_clean$gdpPercap)$p.value) %>%
  mutate(
    correlation = formatC(correlation, format = NULL, digits = 3),
    pval = formatC(pval, format = "e", digits = 2) # Adjust digits as needed
  ) %>%
  kbl(caption = "<center><strong>Correlation (CO2 vs GDP), 1962<center>", align = 'c', caption.align = 'c') %>%
  kable_classic(html_font = "Cambria") 
Correlation (CO2 vs GDP), 1962
correlation pval
0.813 2.93e-280
  # correlation      pval
#         <dbl>     <dbl>
# 1       0.813 2.93e-280

#Calculate correlation for the CO2 vs gdpPerCap for all years and find the highest year
  gapminder_clean %>%
  group_by(Year) %>%
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  summarize(
    correlation = cor.test(`CO2 emissions (metric tons per capita)`, gdpPercap)$estimate,
    pval = cor.test(`CO2 emissions (metric tons per capita)`, gdpPercap)$p.value
  ) %>%
    mutate(
    correlation = formatC(correlation, format = NULL, digits = 3),
    pval = formatC(pval, format = "e", digits = 2) # Adjust digits as needed
  ) %>%
    arrange(desc(correlation)) %>%
    slice_head(n=5)%>%
  kbl(caption = "<center><strong>Correlation(CO2 vs GDP)<center>", align = 'c', caption.align = 'center') %>%
  kable_classic(html_font = "Cambria") 
Correlation(CO2 vs GDP)
Year correlation pval
1967 0.939 3.40e-53
1962 0.926 1.13e-46
1972 0.843 1.82e-32
1982 0.817 5.57e-29
1987 0.81 3.90e-28
  # Year correlation     pval
#   <dbl>       <dbl>    <dbl>
# 1  1967       0.939 3.40e-53

Plotly graph for the highest correlated year

#Plotly for the highest correlated year
 plotly::ggplotly(
   gapminder_clean %>%
  filter(Year==1967, !is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  ggplot(aes(x = `CO2 emissions (metric tons per capita)`, y = gdpPercap)) +
  geom_point(aes(color = continent, size = gdpPercap))
 )
#Not much correlation between continent & CO2 emissions 
# dev.off()

What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)

 plotly::ggplotly(
gapminder_clean %>%
  filter(!is.na(`Energy use (kg of oil equivalent per capita)`) & !is.na(continent)) %>%
  rename(Energy_use = `Energy use (kg of oil equivalent per capita)`) %>%
  ggboxplot(x = "continent", y = "Energy_use", color = "continent") + theme(height = 3, width = 2) + 
  stat_compare_means(method = "anova", , label.y = 17000, label.x = 3) +
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = ".all.", label.y = 16000) 
 )
## There is a statistically significant relationship between continent and energy use

###Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)

plotly::ggplotly(
gapminder_clean %>%
    filter(Year > 1990, continent %in% c("Asia", "Europe"), !is.na(`Imports of goods and services (% of GDP)`) & !is.na(continent)) %>%
  group_by(continent) %>%
  rename(Imports_goods_services_of_GDP = `Imports of goods and services (% of GDP)`) %>%
  ggboxplot(x = "continent", y = "Imports_goods_services_of_GDP", color = "continent",  add = "jitter", palette = "jco", legend = "none") +
  stat_compare_means(method = "t.test", label.y = 210) +
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = "Asia", label.y = 190) 
 )
##No significant difference in Imports between Asia and Europe

###What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

plotly::ggplotly(
  gapminder_clean %>%
  group_by(Year) %>%
  select(Year, `Country Name`, `Population density (people per sq. km of land area)`) %>%
  filter(!is.na(`Population density (people per sq. km of land area)`) & !is.na(`Country Name`)) %>%
  arrange(Year, desc(`Population density (people per sq. km of land area)`)) %>%
  slice_max(n=1, order_by = `Population density (people per sq. km of land area)`) %>%
  ungroup() %>%
  ggplot(aes(x = `Country Name`, fill= `Country Name`)) +
    geom_bar() + ggtitle("Population Density counts(1962-2007)") + 
    ylab("Highest ranked(count)") +
    xlab("Country") +
    theme_classic(base_size = 12) +
    scale_y_continuous(expand=c(0, 0), limits=c(0, NA)) +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) 
    )
#`Country Name`       n
#   <chr>            <int>
# 1 Macao SAR, China     5
# 2 Monaco               5

###What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ between 1962 and 2007?

plotly::ggplotly(
  gapminder_clean %>%
    select(Year, `Country Name`, `Life expectancy at birth, total (years)`) %>%
  filter(!is.na(`Life expectancy at birth, total (years)`) & !is.na(`Country Name`), Year %in% c(1962, 2007)) %>%
  group_by(`Country Name`) %>%
  arrange(Year) %>%
  summarise(Life_exp_change = (`Life expectancy at birth, total (years)`[Year==2007]) - (`Life expectancy at birth, total (years)`[Year==1962]))%>%
  filter(!is.na(Life_exp_change)) %>%
  arrange(desc(Life_exp_change)) %>%
  ungroup() %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(x = `Country Name`,-Life_exp_change), y = Life_exp_change, fill= `Country Name`)) +
    geom_col() + ggtitle("Life Expectancy(1962-2007)") + 
    ylab("Increase in life expectancy(years)") +
    xlab("Country") +
    theme_classic(base_size = 14) + 
scale_y_continuous(expand=c(0, 0), limits=c(0, NA)) +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.5))
)
#Maldives